Import some packages we will use below


In [1]:
import os, sys, functools, json
from collections import namedtuple

import numpy as np
from numpy.random import random_integers

import matplotlib.pyplot as plt
%matplotlib inline

import pandas as pd
from pandas import DataFrame
pd.set_option('display.max_colwidth',100)
pd.set_option('display.float_format', '{:.3g}'.format)

import scipy
import scipy.optimize

import sklearn

lib_path = os.path.abspath('../hammer')
sys.path.append(lib_path)
import hammer

Load our scraped data and check it looks ok


In [2]:
# Load the scraped poll records. The `with` block closes the file handle as soon
# as parsing finishes instead of leaving it open until garbage collection.
with open("../scrape/items.json", "r") as items_file:
    j = json.load(items_file)

In [28]:
# Show four randomly chosen records as a sanity check.
# randint's upper bound is exclusive, so every index is valid; random_integers
# includes its upper bound and could therefore produce the out-of-range index len(j).
[j[i] for i in np.random.randint(0, len(j), 4)]


Out[28]:
[{u'langs': [u'Lua', u'R'],
  u'slug': u'i-can-imagine-this-will-be-a-popular-language-in-t',
  u'votes': [4, 1]},
 {u'langs': [u'Scala', u'Common Lisp'],
  u'slug': u'when-i-run-into-problems-my-colleagues-can-provide',
  u'votes': [21, 14]},
 {u'langs': [u'C', u'Clojure'],
  u'slug': u'code-written-in-this-language-tends-to-be-verbose',
  u'votes': [124, 14]},
 {u'langs': [u'D', u'Io'],
  u'slug': u'if-this-language-didn-t-exist-i-would-have-trouble',
  u'votes': [2, 0]}]

Set up the questions and languages of interest


In [29]:
def lang_gen(j):
    """Yield every language name listed under "langs" in each record of `j`.

    Names are yielded in record order and repeated as often as they occur.
    """
    for record in j:
        for name in record["langs"]:
            yield name
# Every distinct language that appears in at least one poll record
language_set = set()
language_set.update(lang_gen(j))

def questions_gen(j):
    """Yield the "slug" of each record in `j`, in record order (duplicates kept)."""
    for record in j:
        slug = record["slug"]
        yield slug
question_set = set(questions_gen(j))

# Remove the two excluded languages in place (the same set object is kept).
language_set -= set(["REBOL", "Agda"])
# Languages and questions are used as array dimensions below, so fix a stable ordering.
languages = sorted(language_set)
questions = sorted(question_set)

Convert the results to a dict format: question->result matrix


In [30]:
# One poll outcome: the question slug, the languages compared, and their votes.
PollResult = namedtuple("PollResult", ["question", "langs", "votes"])

def result_from_json(el, question_set, language_set):
    """Convert one scraped record into a PollResult, or return None if it is filtered out.

    A record is kept only when its slug is in `question_set` and both of its
    first two languages are in `language_set`. The checks run in that order
    and stop at the first failure.
    """
    if el["slug"] not in question_set:
        return None
    if el["langs"][0] not in language_set:
        return None
    if el["langs"][1] not in language_set:
        return None
    return PollResult(el["slug"], el["langs"], el["votes"])

def create_resdict(languages, questions, j):
    """Build a dict mapping each question to a square language-by-language vote matrix.

    Rows and columns follow the order of `languages`. For every record accepted
    by `result_from_json`, votes[0] is stored at [langs[0], langs[1]] and
    votes[1] at the mirrored position [langs[1], langs[0]]. Cells for pairs
    that never appear stay at zero.
    """
    n_langs = len(languages)
    position = dict((lang, idx) for idx, lang in enumerate(languages))
    res_dict = dict((q, np.zeros([n_langs, n_langs])) for q in questions)

    known_questions = set(questions)
    known_languages = set(languages)
    for el in j:
        poll = result_from_json(el, known_questions, known_languages)
        if not poll:
            # record refers to a question or language outside the requested sets
            continue

        first = position[poll.langs[0]]
        second = position[poll.langs[1]]
        matrix = res_dict[poll.question]
        matrix[first, second] = poll.votes[0]
        matrix[second, first] = poll.votes[1]

    return res_dict

In [31]:
# question slug -> matrix of head-to-head vote counts between languages
res_dict = create_resdict(languages=languages, questions=questions, j=j)

For each question, calculate ELO scores for each language


In [38]:
def expected_wins(dr):
    """ELO expected wins formula.

    `dr` is a rating difference (scalar or array). The result is
    1 / (10**(-dr/400) + 1), which equals 0.5 when dr is 0.
    """
    odds_against = np.power(10,-dr/400.0)
    return 1.0 / (odds_against + 1.0)

def win_ratios(elo_scores):
    """Matrix of expected win ratios: entry (i, j) is > .5 when S[i] > S[j].

    Entry (i, j) is `expected_wins` applied to the rating gap S[i] - S[j].
    """
    ratings = np.asarray(elo_scores).ravel()
    # Column vector minus row vector broadcasts to gap[i, j] = ratings[i] - ratings[j]
    rating_gap = ratings[:, np.newaxis] - ratings[np.newaxis, :]
    return expected_wins(rating_gap)

def elo_scores(results):
    """Fit one ELO rating per row of the square matrix `results`.

    The fit minimises, for every row, the summed difference between the
    recorded results and `matches * win_ratios(ratings)`, where `matches`
    is `results + results.T`. The fitted ratings are then shifted so that
    their average, weighted by each column's total in `matches`, is zero.
    """
    matches = results + results.T

    def elo_error(ratings):
        # One residual per row: recorded results minus the model's prediction
        predicted = matches * win_ratios(ratings)
        return np.sum(results - predicted, axis=1)

    start = np.zeros([results.shape[0]])
    fitted = scipy.optimize.leastsq(elo_error, start)[0]
    return fitted - np.average(fitted, weights=matches.sum(axis=0))

In [45]:
def create_scores(languages, questions, res_dict):
    """Return a DataFrame of ELO scores with one row per language and one column per question.

    Column k holds `elo_scores(res_dict[questions[k]])`.
    """
    table = np.zeros((len(languages), len(questions)))
    for col, question in enumerate(questions):
        table[:, col] = elo_scores(res_dict[question])
    return DataFrame(table, index=languages, columns=questions)

In [46]:
scores = create_scores(languages, questions, res_dict)

# Give nicely formatted names to the columns of `scores`.
# read_csv(index_col=0, parse_dates=True) reproduces the defaults of the removed
# DataFrame.from_csv, and sort_index() replaces the removed argument-less DataFrame.sort().
# NOTE(review): the relabelling below is positional, so it assumes the sorted CSV rows
# line up one-to-one with the sorted `questions` -- confirm.
dfq = pd.read_csv('questions2.csv', index_col=0, parse_dates=True).sort_index()
scores.columns = dfq.question_text

# A single-argument print(...) behaves the same under Python 2 and Python 3.
print("Created scores with shape {} for {} languages and {} questions".format(
    scores.shape, scores.shape[0], scores.shape[1]))
[scores.index,
 scores.columns]


Created scores with shape (48, 111) for 48 languages and 111 questions
Out[46]:
[Index([u'APL', u'AWK', u'Ada', u'Assembler', u'C', u'C#', u'C++', u'Clojure', u'Cobol', u'Common Lisp', u'Coq', u'D', u'Delphi', u'ELisp', u'Eiffel', u'Erlang', u'F#', u'Factor', u'Forth', u'Fortran', u'Go', u'Groovy', u'Haskell', u'Haxe', u'Io', u'J', u'Java', u'Javascript', u'Lua', u'Mathematica', u'Matlab', u'Mozart-Oz', u'O'Caml', u'Objective C', u'PHP', u'Pascal', u'Perl', u'Prolog', u'Python', u'R', u'Ruby', u'Scala', u'Scheme', u'Shell', u'Smalltalk', u'Standard ML', u'TCL', u'Visual Basic'], dtype='object'),
 Index([u'Code written in this language is very readable', u'Code written in this language tends to be terse', u'Code written in this language tends to be verbose', u'Code written in this language tends to be very reliable', u'Code written in this language will usually run in all the major implementations if it runs in one of them.', u'Developers who primarily use this language often burn out after a few years', u'I am reluctant to admit to knowing this language', u'I am sometimes embarrassed to admit to my peers that I know this language', u'I can imagine this will be a popular language in twenty years time', u'I can imagine using this language in my day job', u'I enjoy playing with this language but would never use it for real code', u'I enjoy using this language', u'I find code written in this language very elegant', u'I find it easy to write efficient code in this language', u'I find this language easy to prototype in', u'I know many other people who use this language', u'I know this language well', u'I learned this language early in my career as a programmer', u'I often feel like I am not smart enough to write this language', u'I often get angry when writing code in this language', u'I often write things in this language with the intent of rewriting them in something else later', u'I rarely have difficulty abstracting patterns I find in my code', u'I regularly use this language', u'I still discover new features of this language on a fairly regular basis', u'I use a lot of code written in this language which I really don't want to have to make changes to', u'I use many applications written in this language', u'I use this language out of choice', u'I usually use this language on projects with many other members', u'I usually use this language on solo projects', u'I would like to write more of this language than I currently do', u'I would list this language on my resume', u'I would recommend most programmers learn this language, regardless of whether they have a 
specific need for it', u'I would use this language as a scripting language embedded inside a larger application', u'I would use this language for a desktop GUI project', u'I would use this language for a web project', u'I would use this language for casual scripting', u'I would use this language for mobile applications', u'I would use this language for writing embedded programs', u'I would use this language for writing programs for an embedded hardware platform', u'I would use this language for writing server programs', u'I would use this language to write a command-line app', u'If my code in this language successfully compiles, there is a good chance my code is correct.', u'If this language didn't exist, I would have trouble finding a satisfactory replacement', u'It is easy to debug programs written in this language when it goes wrong', u'It is easy to tell at a glance what code in this language does', u'It is too easy to write code in this language that looks like it does one thing but actually does something else', u'Its unusual for me to discover unfamiliar features"', u'Learning this language improved my ability as a programmer', u'Learning this language significantly changed how I use other languages.', u'Libraries in this language tend to be well documented.', u'Programs written in this language tend to be efficient', u'Programs written in this language tend to play well with others', u'Programs written in this language will usually work in future versions of the language', u'The resources for learning this language are of high quality', u'The semantics of this language are much different than other languages I know.', u'The thought that I may still be using this language in twenty years time fills me with dread', u'There are many good commercial tools for this language', u'There are many good open-source tools for this language', u'There are many good tools for this language', u'There is a lot of accidental complexity when writing code in this language', 
u'There is a wide variety of open source code written in this language', u'Third-party libraries are readily available, well-documented, and of high quality', u'This is a high level language', u'This is a low level language', u'This is a mainstream language', u'This language allows me to write programs where I know exactly what they are doing under the hood', u'This language encourages writing code that is easy to maintain.', u'This language encourages writing reusable code.', u'This language excels at concurrency', u'This language excels at symbolic manipulation', u'This language excels at text processing', u'This language has a good community', u'This language has a good library distribution mechanism.', u'This language has a high quality implementation', u'This language has a niche in which it is great', u'This language has a niche outside of which I would not use it', u'This language has a strong static type system', u'This language has a very coherent design', u'This language has a very dogmatic community', u'This language has a very rigid idea of how things should be done', u'This language has a wide variety of agreed-upon conventions, which are generally adhered to reasonably well, and which increase my productivity', u'This language has an annoying syntax', u'This language has many features which feel tacked on', u'This language has unusual features that I often miss when using other languages', u'This language has well-organized libraries with consistent, carefully thought-out interfaces', u'This language is best for very large projects', u'This language is best for very small projects', u'This language is built on a small core of orthogonal features', u'This language is easier to use for its problem domain by removing unneeded expressiveness (such as not being Turing complete).', u'This language is expressive', u'This language is frequently used for applications it isn't suitable for', u'This language is good for beginners', u'This language is good for 
distributed computing', u'This language is good for numeric computing', u'This language is good for scientific computing', u'This language is large', u'This language is likely to be a passing fad', u'This language is likely to be around for a very long time', u'This language is likely to have a strong influence on future languages', u'This language is minimal', ...], dtype='object')]

Do a PCA decomposition of variance

`scores` has shape `(len(languages), len(questions))`, so PCA selects its component axes in question-space.


In [65]:
# A bare `import sklearn` does not load the `decomposition` submodule, so import it
# explicitly rather than relying on some other module having loaded it already.
import sklearn.decomposition

pca        = sklearn.decomposition.PCA(n_components=4)
langpcafit = pca.fit(scores)

num_components = langpcafit.components_.shape[0]
component_labels = ["C" + str(i) for i in range(0,num_components)]

# Coordinates of each language along the fitted components (rows: languages)
langpcc = DataFrame(pca.transform(scores),
                    index   = languages,
                    columns = component_labels)

# Weight of each question in each component (rows: question texts)
langpcs = DataFrame(langpcafit.components_.transpose(), 
                    index   = dfq.question_text,
                    columns = component_labels)

Plot how much variance our components explain


In [66]:
# Running total of the explained-variance ratio over the fitted components
cevr = np.cumsum(langpcafit.explained_variance_ratio_)

plt.plot(cevr, 'ro')
plt.xlabel('Component')
plt.ylabel('Cumulative explained variance')
plt.ylim([0, 1])
plt.grid()
cevr


Out[66]:
array([ 0.32055701,  0.52114888,  0.64450098,  0.7214947 ])

Plot a scattergraph of languages in component-space


In [67]:
def scatter_simple(X,Y,**kwargs):
    """Open a new 7x7 figure and scatter Y against X as yellow '.' markers.

    Extra keyword arguments are forwarded to `plt.scatter`. A grid is turned
    on and `plt.gray()` is called afterwards. Returns None.
    """
    # New figure first: every later pyplot call in this function targets it
    plt.figure(figsize=(7,7))
    plt.scatter(X, Y, marker='.', color='y', **kwargs)
    plt.grid()
    plt.gray()

def annotate_scatter(labels,X,Y):
    """Write each label centred on its (x, y) point using `plt.annotate`.

    `labels`, `X` and `Y` are walked in parallel; each label gets a
    translucent yellow rounded box with zero padding and font size 12.
    """
    box_style = dict(boxstyle='round,pad=0', fc='yellow', ec='yellow', alpha=.2)
    for label, x, y in zip(labels, X, Y):
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(0, 0),          # zero offset: text sits on the point
                     textcoords='offset points',
                     ha='center',
                     va='center',
                     bbox=dict(box_style),   # fresh dict for every annotation
                     size=12)
        
def scatter_components(df, CX="C0", CY="C1", **kwargs):
    """Scatter the rows of `df` using column `CX` for x and `CY` for y, labelled by index.

    Extra keyword arguments go to `scatter_simple`. Both axes get the same
    symmetric limits, set by the largest absolute value anywhere in `df`.
    """
    xs, ys = df[CX], df[CY]
    scatter_simple(xs, ys, **kwargs)
    annotate_scatter(df.index, xs, ys)

    plt.title('Languages in component space')
    plt.xlabel(CX + ' (ELO points)')
    plt.ylabel(CY + ' (ELO points)')

    limit = df.abs().max().max()
    axes = plt.gca()
    axes.set_xlim([-limit, limit])
    axes.set_ylim([-limit, limit])

In [95]:
# One figure per pair of components: (C0, C1), then (C2, C3)
for cx, cy in [("C0", "C1"), ("C2", "C3")]:
    scatter_components(langpcc, cx, cy)



In [97]:
def headtail(df, n):
    """Return the first `n` rows of `df` followed by its last `n` rows."""
    top = df.head(n)
    bottom = df.tail(n)
    return pd.concat([top, bottom])
def describe_component(langpcs, cpt):
    """Return the single column `cpt` of `langpcs` as a DataFrame, largest value first.

    Uses `sort_values` because `DataFrame.sort` is not available in newer pandas.
    """
    return DataFrame(langpcs[cpt]).sort_values(cpt, ascending=False)
# Five highest- and five lowest-weighted questions of component C0
headtail(df=describe_component(langpcs, cpt="C0"), n=5)


Out[97]:
C0
question_text
This language encourages writing reusable code. 0.191
This language has unusual features that I often miss when using other languages 0.184
I find code written in this language very elegant 0.175
This language is likely to have a strong influence on future languages 0.175
This language is expressive 0.174
I often get angry when writing code in this language -0.12
This is a low level language -0.12
This language makes it easy to shoot yourself in the foot -0.138
The thought that I may still be using this language in twenty years time fills me with dread -0.145
I learned this language early in my career as a programmer -0.179

10 rows × 1 columns

Describe some of the main features of each component


In [101]:
def cpt_summary(langpcs, cpts):
    """For each component in `cpts`, list the index labels of `langpcs` by descending weight.

    Returns a DataFrame with one column per component, in the order given by
    `cpts`; row 0 holds the label with the largest value for that component.
    Uses `sort_values` because `DataFrame.sort` is not available in newer pandas.
    """
    ranked = DataFrame()
    for cpt in cpts:
        by_weight = DataFrame(langpcs[cpt]).sort_values(cpt, ascending=False)
        ranked[cpt] = list(by_weight.index)
    return ranked
# Top three and bottom three questions for every component
headtail(df=cpt_summary(langpcs, cpts=component_labels), n=3)


Out[101]:
C0 C1 C2 C3
0 This language encourages writing reusable code. The semantics of this language are much different than other languages I know. This language has a strong static type system This language makes it easy to shoot yourself in the foot
1 This language has unusual features that I often miss when using other languages This language is built on a small core of orthogonal features This language is suitable for real-time applications This is a low level language
2 I find code written in this language very elegant I often feel like I am not smart enough to write this language Programs written in this language tend to be efficient I would use this language for writing programs for an embedded hardware platform
108 This language makes it easy to shoot yourself in the foot There is a wide variety of open source code written in this language This language excels at text processing This is a high level language
109 The thought that I may still be using this language in twenty years time fills me with dread I use many applications written in this language I would use this language as a scripting language embedded inside a larger application If my code in this language successfully compiles, there is a good chance my code is correct.
110 I learned this language early in my career as a programmer This is a mainstream language I would use this language for casual scripting This language has a strong static type system

6 rows × 4 columns


In [104]:
def component_influences_summarize(df, FCpt, FLang, extraCols=None):
    """Rank the rows of `df` by the absolute product of columns `FCpt` and `FLang`.

    Parameters
    ----------
    df : DataFrame containing columns `FCpt`, `FLang` and any `extraCols`.
        Two helper columns (the product and its absolute value) are added
        to `df` in place, so pass a copy to keep the original untouched.
    FCpt, FLang : names of the two columns to multiply.
    extraCols : optional sequence of additional column names to include.

    Returns
    -------
    DataFrame with columns [FCpt, product, FLang] + extraCols, sorted by the
    absolute product, largest first.
    """
    # None replaces the former mutable `[]` default; behaviour is otherwise unchanged
    extra = list(extraCols) if extraCols is not None else []

    FLangCpt     = FCpt+"*("+FLang+")"
    AbsFLangCpt  = "Abs"+FLang+FCpt

    df[FLangCpt]    = df[FCpt]*df[FLang]
    df[AbsFLangCpt] = df[FLangCpt].abs()

    # sort_values replaces DataFrame.sort, which newer pandas no longer provides
    dfs = df.sort_values([AbsFLangCpt], ascending=False)

    return dfs[[FCpt, FLangCpt, FLang] + extra]

In [105]:
def component_influences(langpcs, scores, FCpt, FLang):
    """Return a data frame for the given language and component.

       index - question
       columns - component weight, product of weight and language score,
                 language score
       sorted by abs(language score * component weight) descending
    """
    per_question = langpcs.copy()
    # Row `FLang` of `scores`, viewed as a per-question column
    per_question[FLang] = scores.T[FLang]
    return component_influences_summarize(per_question, FCpt, FLang)

# Ten questions contributing most (in absolute terms) to Python's position on C0
component_influences(langpcs, scores, FCpt="C0", FLang="Python").head(10)


Out[105]:
C0 C0*(Python) Python
question_text
This language encourages writing code that is easy to maintain. 0.165 34.7 210
This language encourages writing reusable code. 0.191 33.9 177
I would use this language for a web project 0.135 32.2 238
This language is well suited for an agile development approach using short iterations. 0.126 28.9 230
I would use this language as a scripting language embedded inside a larger application 0.098 28.7 293
This language would be good for teaching children to write software 0.0935 28.5 305
This language has a good library distribution mechanism. 0.112 25.5 228
This language is expressive 0.174 25.4 146
I find code written in this language very elegant 0.175 24.9 142
I would use this language for casual scripting 0.0796 23.8 300

10 rows × 3 columns


In [108]:
def component_influences_compare(langpcs, scores, FCpt, lang1, lang2):
    """Rank questions by how strongly they separate `lang2` from `lang1` along `FCpt`.

    Works like `component_influences`, but the language column is the score
    difference `lang2 - lang1`; the two languages' own scores are appended
    as extra columns.
    """
    by_question = scores.transpose()
    pair = [lang1, lang2]
    delta_label = lang2 + '-' + lang1

    table = langpcs.copy()
    table[delta_label] = by_question[lang2] - by_question[lang1]
    table[pair] = by_question[pair]

    return component_influences_summarize(table, FCpt, delta_label, pair)

# Six questions that most separate Python from C++ along C0
component_influences_compare(langpcs, scores, FCpt="C0",
                             lang1="C++", lang2="Python").head(6)


Out[108]:
C0 C0*(Python-C++) Python-C++ C++ Python
question_text
This language makes it easy to shoot yourself in the foot -0.138 60.4 -438 289 -149
I would use this language as a scripting language embedded inside a larger application 0.098 50.5 515 -222 293
I would use this language for a web project 0.135 50.1 371 -132 238
This is a high level language 0.15 45.7 304 -155 149
This language would be good for teaching children to write software 0.0935 45.2 483 -178 305
This language is well suited for an agile development approach using short iterations. 0.126 44.7 355 -125 230

6 rows × 5 columns


In [109]:
def comparebycpt(langpcs, scores, component, languages):
    """Put a component's weights next to the scores of the given languages.

    Returns a DataFrame whose first column is `langpcs[component]` and whose
    remaining columns are the rows of `scores` for `languages`, sorted by the
    component weight, largest first. Uses `sort_values` because
    `DataFrame.sort` is not available in newer pandas.
    """
    weights = DataFrame(langpcs[component])
    language_scores = scores.transpose()[languages]
    combined = pd.concat([weights, language_scores], axis = 1)
    return combined.sort_values(component, ascending=False)

# Three strongest positive and three strongest negative C0 questions for three languages
headtail(df=comparebycpt(langpcs, scores,
                         component="C0",
                         languages=["C++", "Python", "F#"]),
         n=3)


Out[109]:
C0 C++ Python F#
question_text
This language encourages writing reusable code. 0.191 30.2 177 161
This language has unusual features that I often miss when using other languages 0.184 -52.3 96.8 222
I find code written in this language very elegant 0.175 -108 142 201
This language makes it easy to shoot yourself in the foot -0.138 289 -149 -249
The thought that I may still be using this language in twenty years time fills me with dread -0.145 114 -125 -166
I learned this language early in my career as a programmer -0.179 114 -94.2 -280

6 rows × 4 columns


In [110]:
import re
def questions_with(regex):
    """Return the question texts in the global `dfq` that `regex` matches anywhere."""
    matching = []
    for text in dfq.question_text:
        if re.search(regex, text):
            matching.append(text)
    return matching
def highlights(df, languages=None):
    """Unimplemented placeholder: ignores its arguments and always returns None."""
    return None
# Ten languages scoring highest on the question(s) whose text mentions "readable".
# sort_values replaces DataFrame.sort, which newer pandas no longer provides.
qs= questions_with("readable")
scores[qs].sort_values(qs, ascending=False).head(10)


Out[110]:
question_text Code written in this language is very readable
Python 236
Haxe 235
Ruby 162
Eiffel 144
F# 137
Smalltalk 129
Haskell 109
C# 103
Io 96.6
Lua 92.6

10 rows × 1 columns


In [113]:
def bigdiffs(dfq, scores, language1, language2):
    """Rank questions by the importance-weighted score gap between two languages.

    Parameters
    ----------
    dfq : DataFrame with `question_text` and `importance` columns.
    scores : DataFrame with languages as rows and question texts as columns.
    language1, language2 : row labels of `scores` to compare.

    Returns
    -------
    DataFrame indexed by question text with columns
    [language1, language2, "impdelta", "importance"], where
    impdelta = (language1 score - language2 score) * importance,
    sorted by impdelta, largest first.
    """
    by_text = dfq.reset_index().set_index("question_text")
    pair_scores = scores.transpose()[[language1, language2]]
    table = pd.concat([by_text, pair_scores], axis=1)

    table["delta"]    = table[language1]-table[language2]
    table["impdelta"] = table["delta"]*table.importance
    # sort_values replaces DataFrame.sort, which newer pandas no longer provides
    table = table.sort_values("impdelta", ascending=False)
    return table[[language1, language2, "impdelta",
                  "importance"]]
# Largest importance-weighted differences between Python and C++ in both directions
bd = bigdiffs(dfq, scores, language1="Python", language2="C++")
headtail(df=bd.drop("importance", axis=1), n=4)


Out[113]:
Python C++ impdelta
question_text
I would use this language as a scripting language embedded inside a larger application 293 -222 515
I would use this language for casual scripting 300 -157 457
This language is good for beginners 273 -183 456
This language makes it easy to shoot yourself in the foot -149 289 438
I learned this language early in my career as a programmer -94.2 114 -208
Programs written in this language tend to be efficient -90.8 180 -270
This language is suitable for real-time applications -79.1 238 -317
This language has a strong static type system -151 173 -324

8 rows × 3 columns


In [114]:
from pandas.tools.plotting import _get_standard_colors 
    
def parcor2(frame, class_column, cols=None, ax=None, color=None,
            use_columns=False, yticks=None, colormap=None,
            **kwds):
    """Parallel coordinates plotting with the coordinate axes laid out vertically.

    Every row of `frame` is drawn as one line: the row's values give the x
    positions and the columns give the y positions (one y tick per column,
    labelled with the column name).

    Parameters
    ----------
    frame: DataFrame
    class_column: str
        Column name containing class names
    cols: list, optional
        A list of column names to use
    ax: matplotlib.axis, optional
        matplotlib axis object
    color: list or tuple, optional
        Colors to use for the different classes
    use_columns: bool, optional
        If true, columns will be used as yticks
    yticks: list or tuple, optional
        A list of values to use for yticks
    colormap: str or matplotlib colormap, default None
        Colormap to use for line colors.
    kwds: keywords
        Options to pass to matplotlib plotting method

    Returns
    -------
    ax: matplotlib axis object

    Raises
    ------
    ValueError
        If `use_columns` is True and the column labels are not numeric, or
        if `yticks` is not numeric or its length differs from the number of
        plotted columns.

    Examples
    --------
    >>> from pandas import read_csv
    >>> from matplotlib import pyplot as plt
    >>> df = read_csv('https://raw.github.com/pydata/pandas/master/pandas/tests/data/iris.csv')
    >>> parcor2(df, 'Name', color=('#556270', '#4ECDC4', '#C7F464'))
    >>> plt.show()
    """
    import matplotlib.pyplot as plt

    # NOTE(review): _get_standard_colors and pd.core.common.pprint_thing are pandas
    # internals -- confirm they exist in the installed pandas version.
    n = len(frame)
    classes = frame[class_column].drop_duplicates()
    class_col = frame[class_column]

    if cols is None:
        df = frame.drop(class_column, axis=1)
    else:
        df = frame[cols]

    used_legends = set([])

    ncols = len(df.columns)

    # determine values to use for yticks
    if use_columns is True:
        if not np.all(np.isreal(list(df.columns))):
            raise ValueError('Columns must be numeric to be used as yticks')
        y = df.columns
    elif yticks is not None:
        if not np.all(np.isreal(yticks)):
            raise ValueError('yticks specified must be numeric')
        elif len(yticks) != ncols:
            raise ValueError('Length of yticks must match number of columns')
        y = yticks
    else:
        # list(range(...)) is indexable and plottable under both Python 2 and 3
        y = list(range(ncols))

    if ax is None:
        ax = plt.gca()

    color_values = _get_standard_colors(num_colors=len(classes),
                                        colormap=colormap, color_type='random',
                                        color=color)

    colors = dict(zip(classes, color_values))

    for i in range(n):
        x = df.iloc[i].values
        kls = class_col.iat[i]
        label = pd.core.common.pprint_thing(kls)
        if label not in used_legends:
            # only the first line of each class carries a legend entry
            used_legends.add(label)
            ax.plot(x, y, color=colors[kls], label=label, **kwds)
        else:
            ax.plot(x, y, color=colors[kls], **kwds)
    ax.grid()
    ax.set_yticks(y)
    ax.set_yticklabels(df.columns)
    ax.set_ylim(y[0], y[-1])
    ax.legend(loc='upper right')
    return ax

In [116]:
plt.figure(figsize=(6,40))

# Importance-weighted scores for both languages alongside their weighted difference
bd2 = DataFrame({"impdelta": bd["impdelta"],
                 "Python":   bd["Python"]*bd["importance"],
                 "C++":      bd["C++"]*bd["importance"]},
                columns=["impdelta", "Python", "C++"])

# Transposed so that each of the three series becomes one row, i.e. one plotted line
as_rows = bd2[["Python","C++","impdelta"]].transpose().reset_index()
parcor2(as_rows, "index")

# Make the x-axis symmetric around zero
xlim = plt.xlim()
bound = max(np.abs(xlim))
plt.xlim(-bound, +bound)


Out[116]:
(-600.0, 600.0)

In [117]:
# Languages whose C0 coordinate exceeds their C1 coordinate by more than 500
of_interest = (langpcc.C0 - langpcc.C1 ) > 500

# Weight every question's scores by that question's importance.
# The product is taken on plain arrays (positionally), so the rows of `st`
# and `imp` must be in the same order.
st    = scores.transpose()
imp   = DataFrame(dfq.importance)
df    = DataFrame(st.values*imp.values,
                  index   = st.index, 
                  columns = st.columns)

# Mean importance-weighted score of each language
average_scores = DataFrame(df.mean(), columns = ["mean"])

# sort_values replaces DataFrame.sort, which newer pandas no longer provides
average_scores[of_interest].sort_values(["mean"], ascending=False)


Out[117]:
mean
Python 90.4
Haskell 82.6
Scala 73.8
Clojure 73.1
F# 50.9
Ruby 48
O'Caml 42.5
C# 42.2
Java 38.7
Common Lisp 26.7
C 20.8
Haxe 11.7
C++ -17.4

13 rows × 1 columns


In [ ]: